{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 功能：用于核查分词结果是否准确，消除语义、提升优化分词效果；抽取行业实体名词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1、自定义函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#创建用户词典list\n",
    "def getuserdict(filepath):\n",
    "    with open(filepath,encoding = 'utf-8') as f:\n",
    "        userdict = pd.read_table(f,header = None)\n",
    "        userdict = list(userdict.iloc[:,0])\n",
    "    return userdict\n",
    "\n",
    "#创建正确词典list\n",
    "def getcorrectdict(filepath):\n",
    "    with open(filepath,encoding = 'utf-8') as f:\n",
    "        correctdict = f.readlines()\n",
    "    correct_ = []\n",
    "    for i in correctdict:\n",
    "        correct_.append(i.strip())\n",
    "    return correct_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#创建停用词list\n",
    "def stopwordslist(filepath):\n",
    "    stopwords = [line.strip() for line in open(filepath,'r').readlines()]\n",
    "    return stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取评论句子\n",
    "def getcomment(filepath):\n",
    "    with open(filepath,encoding='utf-8',errors='ignore') as f:\n",
    "        data = pd.read_csv(f)\n",
    "        comment = list(data['content'])\n",
    "    return comment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "#一条评论分词观察情况方法\n",
    "def filter_sentence(sentence):  \n",
    "    outer = '' \n",
    "    jieba.load_userdict(r'D:\\分析实战\\京东评论文本挖掘\\data\\user_dict.txt')\n",
    "    for word in jieba.cut(sentence,cut_all = False):  \n",
    "        if word in stopwords or word in userdict or word in correctdict:\n",
    "            continue\n",
    "        else:\n",
    "            outer += word\n",
    "            outer += ' '    \n",
    "\n",
    "    return outer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2、打标工具"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "停用词点 ['———', '》），', '）÷（１－']\n",
      "语料信息： ['还可以刷脸解锁，帮朋友买的，她很满意', '第一次买vivo，真心不错，1498的机子，没想到照相很清晰，性价比很高，买值了，还送了小音响，用了，真不错，机子贴膜都贴好了，不用自己单贴膜，拿起来就用，系统也不错，很快，国产的品牌真挺不错的。', '手机好用快递送的快。']\n"
     ]
    }
   ],
   "source": [
    "stopwords = stopwordslist(r'D:\\分析实战\\京东评论文本挖掘\\data\\哈工大停用词表.txt')\n",
    "comment = getcomment(r\"D:\\分析实战\\京东评论文本挖掘\\data\\京东评论数据.csv\")\n",
    "\n",
    "print('停用词点',stopwords[:3])\n",
    "print('语料信息：',comment[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "请输入起始序号：1\n",
      "请输入结尾序号：3\n"
     ]
    }
   ],
   "source": [
    "#输入需审核的语料序号\n",
    "begin = int(input('请输入起始序号：'))\n",
    "end = int(input('请输入结尾序号：'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "原始语料情况： 第一次买vivo，真心不错，1498的机子，没想到照相很清晰，性价比很高，买值了，还送了小音响，用了，真不错，机子贴膜都贴好了，不用自己单贴膜，拿起来就用，系统也不错，很快，国产的品牌真挺不错的。\n",
      "待审核句子： 贴 单 \n"
     ]
    }
   ],
   "source": [
    "while begin <= end:\n",
    "    #初始化词典\n",
    "    userdict = getuserdict(r'D:\\分析实战\\京东评论文本挖掘\\data\\user_dict.txt')\n",
    "    correctdict = getcorrectdict(r'D:\\分析实战\\京东评论文本挖掘\\data\\correct_dict.txt')\n",
    "    \n",
    "    com = comment[begin]\n",
    "    print(\"原始语料情况：\",com)\n",
    "    \n",
    "    wordlist = filter_sentence(com)\n",
    "    print(\"待审核句子：\", wordlist)\n",
    "    new_user_dict = input('请输入切分错的词组的正确形式，词组与词组间以' '(space键)形式分开：')\n",
    "    new_user_dict = new_user_dict.split(' ')\n",
    "    \n",
    "    with open(r'D:\\分析实战\\京东评论文本挖掘\\data\\user_dict.txt', 'a', encoding='utf-8') as f:\n",
    "        f.write('\\n' + '\\n'.join(new_user_list))\n",
    "    with open('D:\\分析实战\\京东评论文本挖掘\\data\\correct_dict.txt', 'a', encoding='utf-8') as f:\n",
    "        f.write(r'\\n' + '\\n'.join(wordlist))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
